{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2.0.0-rc1\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.model_selection import StratifiedKFold\n",
    "from tqdm import tqdm\n",
    "import tensorflow as tf\n",
    "import tensorflow.keras.backend as K\n",
    "from tensorflow.keras.utils import to_categorical\n",
    "import os\n",
    "from transformers import *\n",
    "print(tf.__version__)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "train shape = (99913, 7)\n",
      "test shape = (10000, 6)\n"
     ]
    }
   ],
   "source": [
    "TRAIN_PATH = './data/train_dataset/'\n",
    "TEST_PATH = './data/test_dataset/'\n",
    "BERT_PATH = './bert_base_chinese/'\n",
    "MAX_SEQUENCE_LENGTH = 140\n",
    "input_categories = '微博中文内容'\n",
    "output_categories = '情感倾向'\n",
    "\n",
    "df_train = pd.read_csv(TRAIN_PATH+'nCoV_100k_train.labled.csv',engine ='python')\n",
    "df_train = df_train[df_train[output_categories].isin(['-1','0','1'])]\n",
    "df_test = pd.read_csv(TEST_PATH+'nCov_10k_test.csv',engine ='python')\n",
    "df_sub = pd.read_csv(TEST_PATH+'submit_example.csv')\n",
    "print('train shape =', df_train.shape)\n",
    "print('test shape =', df_test.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def _convert_to_transformer_inputs(instance, tokenizer, max_sequence_length):\n",
    "    \"\"\"Converts tokenized input to ids, masks and segments for transformer (including bert)\"\"\"\n",
    "    \n",
    "    def return_id(str1, truncation_strategy, length):\n",
    "\n",
    "        inputs = tokenizer.encode_plus(str1,\n",
    "            add_special_tokens=True,\n",
    "            max_length=length,\n",
    "            truncation_strategy=truncation_strategy)\n",
    "        \n",
    "        input_ids =  inputs[\"input_ids\"]\n",
    "        input_masks = [1] * len(input_ids)\n",
    "        input_segments = inputs[\"token_type_ids\"]\n",
    "        padding_length = length - len(input_ids)\n",
    "        padding_id = tokenizer.pad_token_id\n",
    "        input_ids = input_ids + ([padding_id] * padding_length)\n",
    "        input_masks = input_masks + ([0] * padding_length)\n",
    "        input_segments = input_segments + ([0] * padding_length)\n",
    "        \n",
    "        return [input_ids, input_masks, input_segments]\n",
    "    \n",
    "    input_ids, input_masks, input_segments = return_id(instance, 'longest_first', max_sequence_length)\n",
    "    \n",
    "    return [input_ids, input_masks, input_segments]\n",
    "\n",
    "def compute_input_arrays(df, columns, tokenizer, max_sequence_length):\n",
    "    input_ids, input_masks, input_segments = [], [], []\n",
    "    for instance in tqdm(df[columns]):\n",
    "        \n",
    "        ids, masks, segments = _convert_to_transformer_inputs(str(instance), tokenizer, max_sequence_length)\n",
    "        \n",
    "        input_ids.append(ids)\n",
    "        input_masks.append(masks)\n",
    "        input_segments.append(segments)\n",
    "\n",
    "    return [np.asarray(input_ids, dtype=np.int32), \n",
    "            np.asarray(input_masks, dtype=np.int32), \n",
    "            np.asarray(input_segments, dtype=np.int32)\n",
    "           ]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Calling BertTokenizer.from_pretrained() with the path to a single file or url is deprecated\n",
      "100%|███████████████████████████████████████████████████████████████████████████| 99913/99913 [02:02<00:00, 815.76it/s]\n",
      "100%|███████████████████████████████████████████████████████████████████████████| 10000/10000 [00:12<00:00, 821.93it/s]\n"
     ]
    }
   ],
   "source": [
    "tokenizer = BertTokenizer.from_pretrained(BERT_PATH+'bert-base-chinese-vocab.txt')\n",
    "inputs = compute_input_arrays(df_train, input_categories, tokenizer, MAX_SEQUENCE_LENGTH)\n",
    "test_inputs = compute_input_arrays(df_test, input_categories, tokenizer, MAX_SEQUENCE_LENGTH)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def compute_output_arrays(df, columns):\n",
    "    return np.asarray(df[columns].astype(int) + 1)\n",
    "outputs = compute_output_arrays(df_train, output_categories)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# BERT模型\n",
    "<img src=\"https://imgkr.cn-bj.ufileos.com/9115ac01-f455-498b-8c38-9c4abb04046c.png\" width=\"50%\" height=\"50%\" />"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 修改模型\n",
    "- bert 修改\n",
    "- 加入 LSTM、GRU等作为Encoder"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "class WeiboBERT(tf.keras.Model):\n",
    "    def __init__(self):\n",
    "        super(WeiboBERT, self).__init__(name='Weibo_bert')\n",
    "        config = BertConfig.from_pretrained(BERT_PATH + 'bert-base-chinese-config.json',output_hidden_states=True) \n",
    "        self.bert_model = TFBertModel.from_pretrained(BERT_PATH+'bert-base-chinese-tf_model.h5', config=config)\n",
    "        self.concat =  tf.keras.layers.Concatenate(axis=2)\n",
    "        self.avgpool = tf.keras.layers.GlobalAveragePooling1D()\n",
    "        self.dropout = tf.keras.layers.Dropout(0.15)\n",
    "        self.output_ = tf.keras.layers.Dense(3, activation='softmax')\n",
    "   \n",
    "        \n",
    "    def call(self, inputs):\n",
    "        input_id,input_mask,input_atn = inputs\n",
    "        sequence_output, pooler_output, hidden_states  = self.bert_model(input_id, attention_mask=input_mask, token_type_ids=input_atn)\n",
    "        h12 = tf.reshape(hidden_states[-1][:,0],(-1,1,768))\n",
    "        h11 = tf.reshape(hidden_states[-2][:,0],(-1,1,768))\n",
    "        h10 = tf.reshape(hidden_states[-3][:,0],(-1,1,768))\n",
    "        h09 = tf.reshape(hidden_states[-4][:,0],(-1,1,768)) \n",
    "        concat_hidden = self.concat(([h12, h11, h10, h09]))\n",
    "        x = self.avgpool(concat_hidden)\n",
    "        x = self.dropout(x)\n",
    "        x = self.output_(x)\n",
    "        return x"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "gkf = StratifiedKFold(n_splits=5).split(X=df_train[input_categories].fillna('-1'), y=df_train[output_categories].fillna('-1'))\n",
    "\n",
    "valid_preds = []\n",
    "test_preds = []\n",
    "for fold, (train_idx, valid_idx) in enumerate(gkf):\n",
    "    train_inputs = [inputs[i][train_idx] for i in range(len(inputs))]\n",
    "    train_outputs = to_categorical(outputs[train_idx])\n",
    "\n",
    "    valid_inputs = [inputs[i][valid_idx] for i in range(len(inputs))]\n",
    "    valid_outputs = to_categorical(outputs[valid_idx])\n",
    "\n",
    "    model = WeiboBERT()\n",
    "    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)\n",
    "    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['acc']) \n",
    "    \n",
    "    model.fit(train_inputs, train_outputs, validation_data= [valid_inputs, valid_outputs], epochs=2, batch_size=32)\n",
    "    test_preds.append(model.predict(test_inputs))\n",
    "    K.clear_session()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 修改损失函数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "def Focal_Loss(y_true, y_pred, alpha=0.5, gamma=2):\n",
    "    \"\"\"\n",
    "    focal loss for multi-class classification\n",
    "    fl(pt) = -alpha*(1-pt)^(gamma)*log(pt)\n",
    "    :param y_true: ground truth one-hot vector shape of [batch_size, nb_class]\n",
    "    :param y_pred: prediction after softmax shape of [batch_size, nb_class]\n",
    "    :param alpha:\n",
    "    :param gamma:\n",
    "    :return:\n",
    "    \"\"\"\n",
    "    y_pred += tf.keras.backend.epsilon()\n",
    "    ce = -y_true * tf.math.log(y_pred)\n",
    "    weight = tf.pow(1 - y_pred, gamma) * y_true\n",
    "    fl = ce * weight * alpha\n",
    "    reduce_fl = tf.keras.backend.max(fl, axis=-1)\n",
    "    return reduce_fl"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "gkf = StratifiedKFold(n_splits=5).split(X=df_train[input_categories].fillna('-1'), y=df_train[output_categories].fillna('-1'))\n",
    "\n",
    "valid_preds = []\n",
    "test_preds = []\n",
    "for fold, (train_idx, valid_idx) in enumerate(gkf):\n",
    "    train_inputs = [inputs[i][train_idx] for i in range(len(inputs))]\n",
    "    train_outputs = to_categorical(outputs[train_idx])\n",
    "\n",
    "    valid_inputs = [inputs[i][valid_idx] for i in range(len(inputs))]\n",
    "    valid_outputs = to_categorical(outputs[valid_idx])\n",
    "\n",
    "    model = WeiboBERT()\n",
    "    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)\n",
    "    train_dataset = tf.data.Dataset.from_tensor_slices(((train_inputs[0],train_inputs[1],train_inputs[2]),train_outputs)).shuffle(buffer_size=1000).batch(32)\n",
    "    valid_dataset = tf.data.Dataset.from_tensor_slices(((valid_inputs[0],valid_inputs[1],valid_inputs[2]),valid_outputs)).batch(32)\n",
    "    \n",
    "    FL=lambda y_true,y_pred: Focal_Loss(y_true, y_pred, alpha=0.25, gamma=2)\n",
    "    \n",
    "    model.compile(loss=FL, optimizer=optimizer, metrics=['acc'])   \n",
    "    model.fit(train_inputs, train_outputs, validation_data= [valid_inputs, valid_outputs], epochs=2, batch_size=32)\n",
    "    test_preds.append(model.predict(test_inputs))\n",
    "    K.clear_session()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 对抗训练"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "https://kexue.fm/archives/7234 \n",
    "\n",
    "ADVERSARIAL TRAINING METHODS FOR SEMI-SUPERVISED TEXT CLASSIFICATION (https://arxiv.org/pdf/1605.07725.pdf)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "def search_layer(inputs, name, exclude_from=None):\n",
    "    \"\"\"根据inputs和name来搜索层\n",
    "    说明：inputs为某个层或某个层的输出；name为目标层的名字。\n",
    "    实现：根据inputs一直往上递归搜索，直到发现名字为name的层为止；\n",
    "         如果找不到，那就返回None。\n",
    "    \"\"\"\n",
    "    if exclude_from is None:\n",
    "        exclude_from = set()\n",
    "\n",
    "    if isinstance(inputs, tf.keras.layers.Layer):\n",
    "        layer = inputs\n",
    "    else:\n",
    "        layer = inputs._keras_history[0]\n",
    "\n",
    "    if layer.name == name:\n",
    "        return layer\n",
    "    elif layer in exclude_from:\n",
    "        return None\n",
    "    else:\n",
    "        exclude_from.add(layer)\n",
    "        if isinstance(layer, tf.keras.models.Model):\n",
    "            model = layer\n",
    "            for layer in model.layers:\n",
    "                if layer.name == name:\n",
    "                    return layer\n",
    "        inbound_layers = layer._inbound_nodes[0].inbound_layers\n",
    "        if not isinstance(inbound_layers, list):\n",
    "            inbound_layers = [inbound_layers]\n",
    "        if len(inbound_layers) > 0:\n",
    "            for layer in inbound_layers:\n",
    "                layer = search_layer(layer, name, exclude_from)\n",
    "                if layer is not None:\n",
    "                    return layer\n",
    "\n",
    "def sparse_categorical_crossentropy(y_true, y_pred):\n",
    "    \"\"\"自定义稀疏交叉熵\n",
    "    这主要是因为keras自带的sparse_categorical_crossentropy不支持求二阶梯度。\n",
    "    \"\"\"\n",
    "    y_true = K.reshape(y_true, K.shape(y_pred)[:-1])\n",
    "    y_true = K.cast(y_true, 'int32')\n",
    "    y_true = K.one_hot(y_true, K.shape(y_pred)[-1])\n",
    "    return K.categorical_crossentropy(y_true, y_pred)\n",
    "\n",
    "\n",
    "def loss_with_gradient_penalty(y_true, y_pred, epsilon=1):\n",
    "    \"\"\"带梯度惩罚的loss\n",
    "    \"\"\"\n",
    "    loss = K.mean(sparse_categorical_crossentropy(y_true, y_pred))\n",
    "    embeddings = search_layer(y_pred, 'Embedding-Token').embeddings\n",
    "    gp = K.sum(K.gradients(loss, [embeddings])[0].values**2)\n",
    "    return loss + 0.5 * epsilon * gp"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "gkf = StratifiedKFold(n_splits=5).split(X=df_train[input_categories].fillna('-1'), y=df_train[output_categories].fillna('-1'))\n",
    "\n",
    "valid_preds = []\n",
    "test_preds = []\n",
    "for fold, (train_idx, valid_idx) in enumerate(gkf):\n",
    "    train_inputs = [inputs[i][train_idx] for i in range(len(inputs))]\n",
    "    train_outputs = to_categorical(outputs[train_idx])\n",
    "\n",
    "    valid_inputs = [inputs[i][valid_idx] for i in range(len(inputs))]\n",
    "    valid_outputs = to_categorical(outputs[valid_idx])\n",
    "\n",
    "    model = WeiboBERT()\n",
    "    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)\n",
    "    train_dataset = tf.data.Dataset.from_tensor_slices(((train_inputs[0],train_inputs[1],train_inputs[2]),train_outputs)).shuffle(buffer_size=1000).batch(32)\n",
    "    valid_dataset = tf.data.Dataset.from_tensor_slices(((valid_inputs[0],valid_inputs[1],valid_inputs[2]),valid_outputs)).batch(32)\n",
    "    \n",
    "    \n",
    "    model.compile(loss=loss_with_gradient_penalty, optimizer=optimizer, metrics=['acc'])   \n",
    "    model.fit(train_dataset, validation_data= valid_dataset, epochs=2)\n",
    "    test_preds.append(model.predict(test_inputs))\n",
    "    K.clear_session()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
